library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(spatialOnco)
library(spatgraphs)
library(glmnetUtils)
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.16.3). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
##
## Attaching package: 'brms'
## The following object is masked from 'package:stats':
##
## ar
library(bayesplot)
## This is bayesplot version 1.8.1
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
library(DHARMa)
## This is DHARMa 0.4.4. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
DATA_PATH = '../data/nbhd_coord_schurch_2020/'
RESULTS_PATH = '../results/bayes_glmnet_nbhds_01082022/'
dir.create(RESULTS_PATH)
## Warning in dir.create(RESULTS_PATH): '../results/bayes_glmnet_nbhds_01082022'
## already exists
df = read_csv(paste0(DATA_PATH,'CRC_master.csv'))
## New names:
## * `` -> ...1
## Rows: 258385 Columns: 83
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): spots, ClusterName
## dbl (81): ...1, CellID, patients, groups, size, CD44 - stroma, FOXP3 - regul...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
spot.labels = c("15_A","15_B","16_A","16_B")
keep_types = unique(df$ClusterName)
nbhds = make_spot_nbhds(df,spot.labels,keep_types, radius=50)
## Registered S3 method overwritten by 'spatstat.geom':
## method from
## print.boxx cli
cv.fit.4.1 = run_model(cv.glmnet,
file = paste0(RESULTS_PATH,"cv.fit.4.1"),
formula = tumor.cells ~ 1 + (CD4..T.cells.CD45RO. + CD68.CD163..macrophages + CD8..T.cells + Tregs + vasculature + CD11b.CD68..macrophages + CD68..macrophages + stroma + plasma.cells+spot)^2,
data = nbhds,
family = poisson())
plot(cv.fit.4.1)

# frequency of tumor cell counts
table(nbhds$tumor.cells)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 5083 1353 586 305 213 185 146 135 86 78 66 55 46 43 30 30
## 16 17 18 19 20 21 22
## 26 21 14 17 7 5 4
hist(nbhds$tumor.cells, breaks = 0:22)

# get an idea of the dispersion parameter in each spot
nbhds %>%
group_by(spot) %>%
summarise(d_tumor = var(tumor.cells)/mean(tumor.cells))
# This chunk mainly useful for GWR models (which I was unable to get working very well, the bandwidth estimation kept failing, likely because the design matrix was singular in some locations. Could try again)
cf = coef(cv.fit.4.2, s = "lambda.1se")
terms = rownames(cf)[cf[,1] != 0]
terms.keep = unique(str_replace_all(terms,"spot.*","spot"))
terms.keep = unique(terms[-grep("spot",terms)])
terms.keep = terms.keep[-1]
fm = paste0("tumor.cells ~ ",paste(terms.keep, collapse = " + "))
feats = nbhds %>%
named_group_split(spot, keep = FALSE)
coords = df %>%
rename(spot = spots) %>%
filter(spot %in% spot.labels) %>%
select(spot,X,Y) %>%
named_group_split(spot, keep = FALSE)
library(GWmodel)
spdfs = mapply(SpatialPointsDataFrame,coords,feats,SIMPLIFY = F)
spplot(spdfs[[1]],"tumor.cells")
More Bayesian models
fit.4.2 <- brm(formula = bf(tumor.cells ~ 1 + (CD4..T.cells.CD45RO. + CD68.CD163..macrophages + CD8..T.cells + Tregs + vasculature + CD11b.CD68..macrophages +spot)),
family = negbinomial(),
data = nbhds,
iter = 2000,
warmup = 1000,
cores = 4,
file = paste0(RESULTS_PATH,"fit.4.2")
) %>% add_criterion("loo")
summary(fit.4.2)
## Family: negbinomial
## Links: mu = log; shape = identity
## Formula: tumor.cells ~ 1 + (CD4..T.cells.CD45RO. + CD68.CD163..macrophages + CD8..T.cells + Tregs + vasculature + CD11b.CD68..macrophages + spot)
## Data: nbhds (Number of observations: 8534)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept 0.94 0.04 0.85 1.02 1.00 4478
## CD4..T.cells.CD45RO. -0.72 0.06 -0.83 -0.61 1.00 6746
## CD68.CD163..macrophages -0.46 0.02 -0.51 -0.42 1.00 5161
## CD8..T.cells -0.80 0.07 -0.93 -0.66 1.00 6971
## Tregs -0.46 0.05 -0.56 -0.36 1.00 7224
## vasculature -0.41 0.03 -0.46 -0.35 1.00 6901
## CD11b.CD68..macrophages -0.32 0.17 -0.66 0.01 1.00 8318
## spot15_B -1.33 0.07 -1.47 -1.20 1.00 4967
## spot16_A -0.22 0.06 -0.33 -0.11 1.00 5227
## spot16_B 0.51 0.05 0.41 0.60 1.00 4412
## Tail_ESS
## Intercept 3270
## CD4..T.cells.CD45RO. 2458
## CD68.CD163..macrophages 2573
## CD8..T.cells 2990
## Tregs 2713
## vasculature 2817
## CD11b.CD68..macrophages 2845
## spot15_B 3150
## spot16_A 3344
## spot16_B 3453
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape 0.49 0.01 0.46 0.52 1.00 6534 3153
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
fit.4.3 <- brm(formula = bf(tumor.cells ~ 1 + (CD4..T.cells.CD45RO. + CD68.CD163..macrophages + CD8..T.cells + Tregs + vasculature + CD11b.CD68..macrophages +spot)),
family = zero_inflated_negbinomial(),
data = nbhds,
iter = 2000,
warmup = 1000,
cores = 4,
file = paste0(RESULTS_PATH,"fit.4.3")
) %>% add_criterion("loo")
check_brms(fit.4.2)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

check_brms(fit.4.3)

summary(fit.4.3)
## Family: zero_inflated_negbinomial
## Links: mu = log; shape = identity; zi = identity
## Formula: tumor.cells ~ 1 + (CD4..T.cells.CD45RO. + CD68.CD163..macrophages + CD8..T.cells + Tregs + vasculature + CD11b.CD68..macrophages + spot)
## Data: nbhds (Number of observations: 8534)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept 1.34 0.04 1.25 1.42 1.00 2783
## CD4..T.cells.CD45RO. -0.75 0.05 -0.85 -0.64 1.00 4487
## CD68.CD163..macrophages -0.45 0.02 -0.49 -0.41 1.00 4481
## CD8..T.cells -0.79 0.06 -0.92 -0.67 1.00 4523
## Tregs -0.44 0.05 -0.54 -0.35 1.00 5462
## vasculature -0.41 0.03 -0.46 -0.36 1.00 5734
## CD11b.CD68..macrophages -0.26 0.17 -0.59 0.06 1.00 4845
## spot15_B -1.37 0.06 -1.49 -1.25 1.00 3523
## spot16_A -0.26 0.05 -0.36 -0.15 1.00 3284
## spot16_B 0.49 0.04 0.41 0.58 1.00 3250
## Tail_ESS
## Intercept 2672
## CD4..T.cells.CD45RO. 3019
## CD68.CD163..macrophages 3158
## CD8..T.cells 2946
## Tregs 3137
## vasculature 3261
## CD11b.CD68..macrophages 3269
## spot15_B 2675
## spot16_A 2733
## spot16_B 3192
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape 1.28 0.09 1.12 1.45 1.00 2723 2756
## zi 0.32 0.01 0.29 0.35 1.00 2373 2417
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Posterior predictive checks
y = nbhds$tumor.cells
yrep.4.2 <- posterior_predict(fit.4.2, ndraws = 500)
yrep.4.3 <- posterior_predict(fit.4.3, ndraws = 500)
ppc_dens_overlay(y,yrep.4.2)

ppc_dens_overlay(y,yrep.4.3)

prop_zero <- function(x) mean(x == 0)
prop_zero(y)
## [1] 0.5956175
ppc_stat(y, yrep.4.2, stat = "prop_zero", binwidth = 0.005)

ppc_stat(y, yrep.4.3, stat = "prop_zero", binwidth = 0.005)

dispersion <- function(x) var(x)/mean(x)
ppc_stat(y, yrep.4.2, stat = "dispersion", binwidth = 0.005)

ppc_stat(y, yrep.4.3, stat = "dispersion", binwidth = 0.005)

ppc_stat_grouped(y, yrep.4.3, group = nbhds$spot, stat = "prop_zero")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat_grouped(y, yrep.4.3, group = nbhds$spot, stat = "dispersion")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

d = nbhds[nbhds$spot == "15_A",]
fit.4.3b <- brm(formula = bf(tumor.cells ~ 1 + (CD4..T.cells.CD45RO. + CD68.CD163..macrophages + CD8..T.cells + Tregs + vasculature + CD11b.CD68..macrophages)),
family = zero_inflated_negbinomial(),
data = d,
iter = 2000,
warmup = 1000,
cores = 4,
file = paste0(RESULTS_PATH,"fit.4.3b")
) %>% add_criterion("loo")
check.4.3b = check_brms(fit.4.3b, plot = F)
testSpatialAutocorrelation(simulationOutput = check.4.3b, x = d$X, y= d$Y)

##
## DHARMa Moran's I test for distance-based autocorrelation
##
## data: check.4.3b
## observed = 0.06536117, expected = -0.00040161, sd = 0.00078674, p-value
## < 2.2e-16
## alternative hypothesis: Distance-based autocorrelation
fit.4.4 <- brm(formula = bf(tumor.cells ~ 1 + (CD4..T.cells.CD45RO. + CD68.CD163..macrophages + CD8..T.cells + Tregs + vasculature + CD11b.CD68..macrophages +spot), zi ~ spot),
family = zero_inflated_negbinomial(),
data = nbhds,
# control = list(adapt_delta = 0.99),
iter = 2000,
warmup = 1000,
cores = 4,
file = paste0(RESULTS_PATH,"fit.4.4")
) %>% add_criterion("loo")
## Registered S3 method overwritten by 'spdep':
## method from
## plot.mst ape
summary(fit.4.4)
## Family: zero_inflated_negbinomial
## Links: mu = log; shape = identity; zi = logit
## Formula: tumor.cells ~ 1 + (CD4..T.cells.CD45RO. + CD68.CD163..macrophages + CD8..T.cells + Tregs + vasculature + CD11b.CD68..macrophages + spot)
## zi ~ spot
## Data: nbhds (Number of observations: 8534)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept 1.36 0.05 1.26 1.45 1.00 2644
## zi_Intercept -0.62 0.10 -0.82 -0.43 1.00 2357
## CD4..T.cells.CD45RO. -0.74 0.05 -0.84 -0.63 1.00 5015
## CD68.CD163..macrophages -0.45 0.02 -0.49 -0.41 1.00 4592
## CD8..T.cells -0.79 0.06 -0.91 -0.66 1.00 4130
## Tregs -0.45 0.05 -0.54 -0.35 1.00 4792
## vasculature -0.41 0.03 -0.46 -0.35 1.00 4443
## CD11b.CD68..macrophages -0.30 0.16 -0.62 0.01 1.00 4793
## spot15_B -1.62 0.10 -1.81 -1.43 1.00 1055
## spot16_A -0.35 0.06 -0.46 -0.23 1.00 3003
## spot16_B 0.46 0.05 0.37 0.56 1.00 3040
## zi_spot15_B -1.87 2.63 -10.67 -0.44 1.01 438
## zi_spot16_A -0.42 0.15 -0.72 -0.14 1.00 2777
## zi_spot16_B -0.15 0.10 -0.34 0.05 1.00 2927
## Tail_ESS
## Intercept 2730
## zi_Intercept 2636
## CD4..T.cells.CD45RO. 3008
## CD68.CD163..macrophages 2818
## CD8..T.cells 2833
## Tregs 2898
## vasculature 2744
## CD11b.CD68..macrophages 2878
## spot15_B 998
## spot16_A 3294
## spot16_B 2808
## zi_spot15_B 173
## zi_spot16_A 2803
## zi_spot16_B 2604
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape 1.21 0.09 1.04 1.40 1.00 1735 2026
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
yrep.4.4 <- posterior_predict(fit.4.4, ndraws = 500)
ppc_stat(y, yrep.4.4, stat = "prop_zero", binwidth = 0.005)

ppc_stat(y, yrep.4.4, stat = "dispersion", binwidth = 0.005)

ppc_stat_grouped(y, yrep.4.4, group = nbhds$spot, stat = "prop_zero")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat_grouped(y, yrep.4.4, group = nbhds$spot, stat = "dispersion")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Spatial autocorrelation
d = nbhds[nbhds$spot == "15_A",]
fit.4.5 <- brm(formula = bf(tumor.cells ~ 1 + (CD4..T.cells.CD45RO. + CD68.CD163..macrophages + CD8..T.cells + Tregs + CD11b.CD68..macrophages), zi ~ 1),
family = zero_inflated_negbinomial(),
data = d,
# control = list(adapt_delta = 0.99),
iter = 2000,
warmup = 1000,
cores = 4,
file = paste0(RESULTS_PATH,"fit.4.5")
) %>% add_criterion("loo")
check.4.5 = check_brms(fit.4.5, plot = F)
testSpatialAutocorrelation(simulationOutput = check.4.5, x = d$X, y= d$Y)

##
## DHARMa Moran's I test for distance-based autocorrelation
##
## data: check.4.5
## observed = 0.06832943, expected = -0.00040161, sd = 0.00078674, p-value
## < 2.2e-16
## alternative hypothesis: Distance-based autocorrelation
tibble(X = d$X, Y = d$Y, res = check.4.5$scaledResiduals) %>%
ggplot(aes(x = X, y = Y, colour = res)) +
geom_point(alpha=0.7) +
scale_colour_gradient(low = "red", high = "blue")
